## OGR data source with driver: ESRI Shapefile
## Source: "/home/dflynn-volpe/workingdata", layer: "hexagons_1mi_routes_AADT_total_sum"
## with 5201 features
## It has 4 fields
## OGR data source with driver: ESRI Shapefile
## Source: "/home/dflynn-volpe/workingdata", layer: "hexagons_1mi_routes_sum"
## with 8111 features
## It has 3 fields
## OGR data source with driver: ESRI Shapefile
## Source: "/home/dflynn-volpe/workingdata", layer: "hexagons_1mi_bg_lodes_sum"
## with 33184 features
## It has 32 fields
## OGR data source with driver: ESRI Shapefile
## Source: "/home/dflynn-volpe/workingdata", layer: "hexagons_1mi_bg_rac_sum"
## with 33184 features
## It has 32 fields
Actual sample size imbalance:
kable(w.04_09 %>% group_by(MatchEDT_buffer) %>% count(), caption = "Count of EDT crash presence/absence over April-September 2017, Maryland")
| MatchEDT_buffer | n |
|---|---|
| 0 | 1903966 |
| 1 | 99425 |
Imbalance has a temporal component, with more Waze records and more matching EDT crashes occurring in commuting times.
# long format using dplyr
hr.1 <- w.04_09 %>%
group_by(hour, MatchEDT_buffer) %>%
summarise(n = n()) %>%
mutate(freq = round(100 * n / sum(n), 2))
kable(hr.1, caption = "Percent of EDT crash presence/absence records by hour over April-September 2017, Maryland")
| hour | MatchEDT_buffer | n | freq |
|---|---|---|---|
| 0 | 0 | 30518 | 96.74 |
| 0 | 1 | 1029 | 3.26 |
| 1 | 0 | 23108 | 97.15 |
| 1 | 1 | 679 | 2.85 |
| 2 | 0 | 18995 | 97.33 |
| 2 | 1 | 521 | 2.67 |
| 3 | 0 | 18458 | 97.63 |
| 3 | 1 | 449 | 2.37 |
| 4 | 0 | 24427 | 97.54 |
| 4 | 1 | 616 | 2.46 |
| 5 | 0 | 41959 | 97.38 |
| 5 | 1 | 1130 | 2.62 |
| 6 | 0 | 70404 | 96.56 |
| 6 | 1 | 2511 | 3.44 |
| 7 | 0 | 107983 | 95.54 |
| 7 | 1 | 5044 | 4.46 |
| 8 | 0 | 120367 | 94.94 |
| 8 | 1 | 6420 | 5.06 |
| 9 | 0 | 109740 | 95.12 |
| 9 | 1 | 5625 | 4.88 |
| 10 | 0 | 100484 | 95.84 |
| 10 | 1 | 4365 | 4.16 |
| 11 | 0 | 102299 | 95.97 |
| 11 | 1 | 4298 | 4.03 |
| 12 | 0 | 105513 | 95.58 |
| 12 | 1 | 4874 | 4.42 |
| 13 | 0 | 107929 | 95.15 |
| 13 | 1 | 5506 | 4.85 |
| 14 | 0 | 113201 | 94.49 |
| 14 | 1 | 6606 | 5.51 |
| 15 | 0 | 125570 | 93.64 |
| 15 | 1 | 8523 | 6.36 |
| 16 | 0 | 136768 | 93.22 |
| 16 | 1 | 9941 | 6.78 |
| 17 | 0 | 140513 | 93.25 |
| 17 | 1 | 10167 | 6.75 |
| 18 | 0 | 116014 | 93.86 |
| 18 | 1 | 7592 | 6.14 |
| 19 | 0 | 85069 | 94.78 |
| 19 | 1 | 4687 | 5.22 |
| 20 | 0 | 66471 | 95.65 |
| 20 | 1 | 3025 | 4.35 |
| 21 | 0 | 55378 | 95.87 |
| 21 | 1 | 2386 | 4.13 |
| 22 | 0 | 46033 | 95.90 |
| 22 | 1 | 1967 | 4.10 |
| 23 | 0 | 36765 | 96.17 |
| 23 | 1 | 1464 | 3.83 |
Minimum and maximum imbalance are shown on the figure.
hr.1$text <- paste("Hour:", hr.1$hour, "\n EDT crash:", hr.1$MatchEDT_buffer,
"\n Frequency:", round(hr.1$freq, 2))
gp <- ggplot(hr.1, aes(x = hour,
y = freq,
group = MatchEDT_buffer,
text = text)) +
# geom_point() +
ylab("Frequency") + xlab("Hour of Day") + theme_bw() +
ggtitle("Class imbalance by time") +
geom_step(aes(color = MatchEDT_buffer), lwd = 2)
# gp
maxval <- hr.1 %>%
group_by(MatchEDT_buffer) %>%
summarise(max.freq = max(freq),
hr.max = hour[which(freq == max.freq)],
min.freq = min(freq),
hr.min= hour[which(freq == min.freq)])
plot.freq = c(maxval$max.freq, maxval$min.freq) + c(-3, +3, -3, +3)
gp2 <- gp + annotate("text",
x = c(maxval$hr.max, maxval$hr.min),
y = plot.freq,
label = round(c(maxval$max.freq, maxval$min.freq), 2)) +
guides(color=guide_legend(title="EDT crash class"))
# gp2
ggplotly(gp2, tooltip = "text") %>% layout(hovermode = 'x')
load(file.path(localdir, "Output_to_30_week"))
load(file.path(localdir, "Outputs_strata_test"))
tabl <- vector()
mods = c(names(keyoutputs)[4], names(keyoutputs_strata))
for(i in 1:length(mods)){
if(i == 1){
tabl <- rbind(tabl, c(100*keyoutputs[[mods[i]]]$diag, round(keyoutputs[[mods[i]]]$auc, 4)))
} else {
tabl <- rbind(tabl, c(100*keyoutputs_strata[[mods[i]]]$diag, round(as.numeric(keyoutputs_strata[[mods[i]]]$auc), 4)))
}
}
tabl <- as.data.frame(tabl)
colnames(tabl)[1:5] = c("Accuracy", "Precision", "Recall", "False Positive Rate", "AUC")
tabl$cutoff = c(.2, .5, .5, .5, .2, .2, .2)
tabl$sampling = c("None", "10:1", "5:1", "1:1", "10:1", "5:1", "1:1")
rownames(tabl) = mods
datatable(tabl, filter = 'top',
caption = "Testing stratification and cutoffs for Model 30",
rownames = T,
options = list(dom = "t",
pageLength = length(mods))
) %>% formatCurrency(2:4, currency = "", digits = 2) %>%
formatStyle(1, background = styleEqual(max(tabl[,1]), 'lightgreen'))%>%
formatStyle(2, background = styleEqual(max(tabl[,2]), 'lightgreen'))%>%
formatStyle(3, background = styleEqual(max(tabl[,3]), 'lightgreen'))%>%
formatStyle(4, background = styleEqual(min(tabl[,4]), 'lightgreen'))%>%
formatStyle(5, background = styleEqual(max(tabl[,5]), 'lightgreen'))
Variable importance:
s3load(object = file.path(outputdir, paste("Model_30_original_RandomForest_Output.RData", sep= "_")), bucket = waze.bucket)
rf.out.og <- rf.out
s3load(object = file.path(outputdir, "Stratification_test_RandomForest_Output.RData"), bucket = waze.bucket)
par(mfrow=c(2,2), mar = c(5,3, 3, 1))
varImpPlot(rf.out.og,
main = "Model 30 Variable Importance \n Original",
n.var = 15,
cex = 0.6)
varImpPlot(rf.30_10,
main = "Model 30 Variable Importance \n 10:1",
n.var = 15,
cex = 0.6)
varImpPlot(rf.30_05,
main = "Model 30 Variable Importance \n 5:1",
n.var = 15,
cex = 0.6)
varImpPlot(rf.30_01,
main = "Model 30 Variable Importance \n 1:1",
n.var = 15,
cex = 0.6)